/*	************************************************************	*/
/*	GAUSS code for simulations in:					*/
/*	Boehmke, Frederick J. 2003. "Using Auxiliary Data to Estimate	*/
/*	Selection Bias Models, With an Application to Interest Groups'	*/ 
/*	Use of the Direct Initiative Process." Political Analysis: 	*/
/*	Volume 11 (Summer 2003): 234-254.				*/
/*	************************************************************	*/

/*	************************************************************	*/
/*     	File:		boehmke2003pa-simulate.prg			*/
/*     	Date:   	February 23, 2004				*/
/*      Author: 	Frederick J. Boehmke				*/
/*	Contact:	frederick-boehmke@uiowa.edu			*/
/*      Purpose:	Runs Monte Carlo simulations to compare the	*/
/*			parameters of a naive probit with my two-stage	*/
/*			method.	This file sets rho=0.5 and varies the	*/
/*			size of the auxiliary data set. It can be 	*/
/*			adapted to the other values of rho in the 	*/
/*			paper by changing one line. The data are	*/
/*			saved in an ascii file for analysis.		*/
/*      Output File:	twostgN2.out, twostgN2.asc			*/
/*	************************************************************	*/

/************************************************************************/
/* PROC GENINDVAR: generates the X variables: x1 is selection equation	*/
/*	variable and x2 is interest equation variable.	Returns Nx2	*/
/*	data matrix.							*/
/* Variables Passed:                                                    */
/*      N: 	scaler, sample size 	  				*/
/*      probx1: scaler, probablity X1 is 1				*/
/************************************************************************/

proc(1) = genindvar(probx1,N);

	local x1,x1b,x2,seed;

	x1=rndu(N,1);
	x1=x1 .ge (1-probx1);
	x2=rndn(N,1);

    retp(x1~x2);

endp;


/************************************************************************/
/* PROC GENDEPVAR							*/
/* 	Transform the data to get the correlation set up right.		*/
/* 	Adds correlated errors to alpha + X*beta and outputs two data	*/
/*	matrices.							*/
/* Variables Passed:                                                    */
/*      N: 	scaler, sample size 	  				*/
/************************************************************************/

proc (1) = gendepvar(indvar,N,rho);

	local u1,u2,sigma11,sigma12,V1,V2,V,C,u,y1star,y2star,y1,y2,x1,x2,x1b;

	x1=indvar[.,1];
	x2=indvar[.,2];

	u1=rndn(N,1);
	u2=rndn(N,1);

	sigma11=1;
	sigma12=rho;

	V1=sigma11~sigma12;
	V2=sigma12~sigma11;
	V=V1'~V2';
	C = (chol(V))';
	u=(u1~u2)*C';

	u1=u[.,1];
	u2=u[.,2];

	y1star = -1+x1+u1;
	y2star =  0-x2+u2;

	y1=y1star .ge 0;
	y2=y2star .ge 0;

    retp(y1~x1~y2~x2);

endp;



/************************************************************************/
/* PROC SAMPLE								*/
/* 	This is the random sample I take to estimate first-stage 	*/
/*	selection parameters.	A random sample of size SAMPSIZE is	*/
/*	generated and the probability that x1=1 is calculated.		*/
/* Variables Passed:                                                    */
/*      sampsize: 	scaler, random sample size			*/
/************************************************************************/

proc (1) = sample(sampsize,x1prob);

	local sample,px1;

	sample = (rndu(sampsize,1) .GE (1-x1prob));

    retp(sample);
endp;

/************************************************************************/
/* PROC SELECT								*/
/* Here's the selection part, y2 only observed if y1=1.			*/
/* I sort the data on y1, figure out how many ones there are and then	*/
/*	use this to drop all the observations with y1=0.		*/
/* I then organize the data into some useful matrices.			*/
/* Variables Passed:                                                    */
/*      sampsize: 	scaler, random sample size			*/
/************************************************************************/

proc (1) = select(data,N);

	local responders,num1,selected;

	responders=sortc(data,1);
	num1=sumc(responders[.,1]);
	selected=responders[N-num1+1:N,.];

    retp(selected);

endp;

/************************************************************************/
/* PROC CELLPROBS							*/
/* 	This calculates the cell probabilities using the estimated	*/
/*	probability that x1=1 from the sample and the responses.	*/
/* Variables Passed:                                                    */
/*      px1: 		auxilary sample probability that x1=1		*/
/*      selected: 	the data from the observed responses 		*/
/************************************************************************/

proc (1) = cellprobs(px1,selected,N);

	local py1,py0,px0,px1y1,px0y1,p10,p11;

	py1=sumc(selected[.,1])/N;
	py0=1-py1;
	px0=1-px1;
	px1y1=sumc(selected[.,2])/rows(selected);	@ P(x=1|y=1) @
	px0y1=1-px1y1;
	p10=px0y1*py1/px0;				@ P(y=1|x=0) @
	p11=px1y1*py1/px1;

    retp(p10~p11);
endp;

/************************************************************************/
/* PROC DELTA								*/
/* 	This calculates the estimated variances of the first-stage 	*/
/*	coefficients using the delta method.				*/
/* Variables Passed:                                                    */
/*      px0y1: 		Estimated P(X=0|Y=1) from respondents.		*/
/*      py1: 		Estimated probability of response.		*/
/*	px1:		Estimated P(x=1) from auxilary sample.		*/
/*	N:		Total number of potential respondents.		*/
/*	sampsize:	Auxilary sample size.				*/
/* Returns:                                                    		*/
/*      sigma: 		Estimated 2x2 vcv matrix.			*/
/************************************************************************/


proc (1) = delta(px0y1,py1,px1,N,sampsize);

local sigmaB, sigmaA, sigma, alpha, b11, b22, b33, b12, a11, a12, a13, a21, a22, a23, a1, a2, a3;

	alpha=sampsize/N;

	sigmaA=zeros(2,3);
	sigmaB=zeros(3,3);

	a1=px0y1;
	a2=py1;
	a3=1-px1;

	a11=(a2/a3)*(1/pdfn(cdfni(a1*a2/a3)));
	a12=(a1/a3)*(1/pdfn(cdfni(a1*a2/a3)));
	a13=-((a1*a2)/(a3*a3))*(1/pdfn(cdfni(a1*a2/a3)));
	a21=-(a2/(1-a3))*(1/pdfn(cdfni((1-a1)*a2/(1-a3))))-a11;
	a22=((1-a1)/(1-a3))*(1/pdfn(cdfni((1-a1)*a2/(1-a3))))-a12;
	a23=(((1-a1)*a2)/((1-a3)*(1-a3)))*(1/pdfn(cdfni((1-a1)*a2/(1-a3))))-a13;

	sigmaA[1,.]=a11~a12~a13;
	sigmaA[2,.]=a21~a22~a23;

	b11=(px0y1*(1-px0y1));
	b12=(1-px0y1)*py1 - py1*px1;
	b22=(py1*(1-py1));
	b33=(px1*(1-px1));

	sigmaB[1,.]=(b11~b12~0)/N;
	sigmaB[2,.]=(b12~b22~0)/N; 
	sigmaB[3,3]=(b33)/(alpha*N);

	sigma= sigmaA*sigmaB*(sigmaA');

    retp(sigma);
endp;

/************************************************************************/
/* PROC LOGLIKTWOS							*/
/* 	This is the two-stage log likelihood function.			*/
/* Variables Passed:                                                    */
/*      selected:	the data.					*/
/*      stvals:		starting values for the LLF.			*/
/************************************************************************/

proc logliktwos(beta,data);

	local xb,x,y,wg,w,g,llik,rho,index1,index2,data2,index,a1,b1;

	y=data[.,3];                               @ break up dta @
	x=ones(rows(data),1)~data[.,4];		   @ eqn of interest vars @

	w=ones(rows(data),1)~data[.,2];		   @ selection eqn vars @

	g=data[1,cols(data)-1:cols(data)];	   @ Stage one coefficients @
            
	xb=x*beta[1:2];
	wg=w*g';	    			   @ creates SE index @
	rho=(exp(2*beta[3])-1)/(exp(2*beta[3])+1); @ inverse of Fisher's Z transformation @

	llik=ln(cdfbvn((2*y-1).*xb,wg,(2*y-1)*rho)./cdfn(wg));

    retp( llik );

endp;


/* **********************************************************************/ 
/* **********************************************************************/ 
/* 			BEGIN MAIN PROGRAM SECTION                      */
/* **********************************************************************/ 
/* **********************************************************************/ 

output file = twostgN2.out RESET;
output file = twostgN2.out OFF;

library maxlik;
library cml;
#include cml.ext;
cmlset;

N	= 10000;
x1prob	= 1/3;
draws	= 1000;
rho	= 0.5;
indvar	= zeros(N,2);
data	= zeros(N,4);
selected= zeros(N,4);
twostage= zeros(20*draws,11);
sampsize= 1000;
stvals= {0,0,0};

seed	= 103;
rndseed seed;

level=0;					@ this varies first stage N @

do while level < 20;

    sampsize= 1000 - level*50;

    indvar = genindvar(x1prob,N);

    counter=draws*level+1;

	  do while counter <= draws*(level+1);	@ Starts the loop	@

	  data	   = gendepvar(indvar,N,rho);
	  selected = select(data,N);
	  numresp  = rows(selected);

	  auxdata=sample(sampsize,x1prob);
	  px1	= sumc(auxdata)/sampsize;

	  probs	= cellprobs(px1,selected,N);

						/* For small aux sample size (50) sometimes > 1 */
						/* Model not identified then! 			*/
	    					/* Set to MD and go to next iteration.		*/
	    					/* Remember to drop during analysis.		*/
	  if maxc(probs') < 1;
	      a1 = cdfni(probs[.,1]);
	      b1 = cdfni(probs[.,2]) - a1;

	  else;
	      twostage[counter,.] = -99.*ones(1,cols(twostage));	
	      goto break;

	  endif;

	  py1=numresp/N;
  	  px0y1=1-(sumc(selected[.,2])/numresp);

	  deltavar = delta(px0y1,py1,px1,N,sampsize);

						/* Attach first stage probability estimates	*/
						/* as data to pass them to LLF.			*/

	  dataplus=selected~a1*ones(rows(selected),1)~b1*ones(rows(selected),1);

	   _cml_bounds  = {-100 100 , -100 100 ,  -9 9};
	   _cml_Options = { newton stepbt };
	   _cml_MaxIters= 500;

	  screen OFF;
	  {beta,logl,g,vcv,ret}=cmlprt(cml(dataplus,0,&logliktwos,stvals));
	  screen ON;

	  twostage[counter,1:3]	= beta';
	  twostage[counter,4]	= a1;
	  twostage[counter,5]	= b1;
	  twostage[counter,6]	= sampsize;
	  twostage[counter,7]	= rows(selected);
	  twostage[counter,8]	= sumc(selected[.,3]);
	  twostage[counter,9:10]= diag(deltavar)';
	  twostage[counter,11]	= ret;
	format /rd 5,0;
	  print "Level: " level "Draw: " counter;
	format /mb1 /ros 16,8;

break:

	  counter=counter+1;

	endo;

    output file = twostgN2.out on;
    print;
    print "Seed: " seed;
    print "Auxilary sample size: " sampsize;
    print "Avg responses: " meanc(twostage[(level*draws)+1:(level+1)*draws,7]);
    print "Average coefficient	SE of average";
    meanc(twostage[(level*draws)+1:(level+1)*draws,1:5])~stdc(twostage[(level*draws)+1:(level+1)*draws,1:5]);
    output file = twostgN2.out off;

    level=level+1;

endo;

output file = twostgN2.asc RESET;
	twostage;
output file = twostgN2.asc OFF;

twostgN2=twostage;
save twostgN2;
system;
